library("igraph")
library("utils")
library("DT")
library("Matrix")
library("dplyr")
library("pheatmap")
library("taxize")
library("BiocFileCache")
library("STRINGdb")

1 First step: Read data

In the first step, we read in the data. The data is expected to be in a tab-seperated format, containing two columns: 1. a column with the name of the genesets and 2. a column with the genes contained in the geneset separated by comma.

getGenes <- function(genesets){
  if(length(genesets) == 0){
    return(list())
  }
  genesets <- lapply(1:nrow(genesets), function(i) {
    toupper(strsplit(genesets[i, 1], " ")[[1]])
  })

  return(genesets)
}
genesets <- utils::read.delim(
          "sample_geneset.txt",
          header = TRUE,
          as.is = TRUE,
          sep = "\t",
          quote = "",
          row.names = 1,
          check.names = FALSE
        )
DT::datatable(genesets)
geneset_names <- seq(1, length(rownames(genesets)))
genes <- getGenes(genesets)

2 Second Step: Scoring the data

After the data has been read in, it can be scored. For this, we currently provide three scores.

getMeetMinMatrix <- function(genesets, geneset_names){
  l <- length(genesets)
  if(l == 0){
    return(-1)
  }
  m <- Matrix::Matrix(0, l, l)

  for(i in 1:(l - 1)){
    a <- genesets[i]
    for(j in (i+1):l){
      b <- genesets[j]
      int <- length(intersect(a, b))

      m[i, j] <- m[j, i] <- 1 - (int / min(length(a), length(b)))
    }
  }

  rownames(m) <- geneset_names
  colnames(m) <- geneset_names
  return(m)
}
calculateKappa <- function(a, b, all_genes){
  set_int <- length(intersect(a, b))
  l <- length(all_genes)

  only_a <- sum(all_genes %in% a & !all_genes %in% b)
  only_b <- sum(!all_genes %in% a & all_genes %in% b)

  background <- l - sum(only_a, only_b, set_int)

  O <- (set_int + background) / l
  E <- (set_int + only_a) * (set_int + only_b) + (only_b + background) * (only_a + background)
  E <- E / l^2

  kappa <- ((O-E) / (1-E))

  if(is.nan(kappa)){
    kappa <- 0
  }
  return(min(abs(1 - kappa), 1))
}


getKappaMatrix <- function(genesets, geneset_names){
  l <- length(genesets)
  if(l == 0){
    return(-1)
  }
  k <- Matrix::Matrix(0, l, l)
  unique_genes <- unique(unlist(genesets))

  for(i in 1:(l - 1)){
    a <- unlist(genesets[i])
    for(j in (i+1):l){
      b <- unlist(genesets[j])
      k[i, j] <- k[j, i] <- calculateKappa(a, b, unique_genes)
    }
  }
  rownames(k) <- geneset_names
  colnames(k) <- geneset_names
  return(k)
}
mmDist <- getMeetMinMatrix(genes, geneset_names)

kappaDist <- getKappaMatrix(genes, geneset_names)

pheatmap::pheatmap(mmDist, show_rownames = F, show_colnames = F)

pheatmap::pheatmap(kappaDist, show_rownames = F, show_colnames = F)

Next, we plot a graph of close genesets.

getAdjacencyMatrix <- function(distanceMatrix, cutOff){
  l <- nrow(distanceMatrix)
  adjMat <- Matrix::Matrix(0, l, l)

  for(i in 1:l){
    edge <- which(distanceMatrix[i, ] <= cutOff)
    no_edge <- which(distanceMatrix[i, ] > cutOff)

    adjMat[i, edge] <- 1
    adjMat[i, no_edge] <- 0
  }
  rownames(adjMat) <- rownames(distanceMatrix)
  colnames(adjMat) <- colnames(distanceMatrix)
  return(adjMat)
}

buildGraph <- function(adjMatrix){
  g <- igraph::graph_from_adjacency_matrix(
    adjMatrix,
    mode = "undirected",
    add.colnames = NULL,
    add.rownames = NA
  )
  return(simplify(g))
}
adj <- getAdjacencyMatrix(mmDist, 0.3)
g <- buildGraph(adj)

plot(g, layout=layout.random, main="random")

adj <- getAdjacencyMatrix(kappaDist, 0.3)
g <- buildGraph(adj)

plot(g,layout=layout.random, main="random")

3 Third Step: Clustering

In the next step, we cluster the data based on the scoring.

checkInclusion <- function(seeds) {
  if(length(seeds) <= 1){
    return(seeds)
  }
  remove <- c()
  for (i in 1:(length(seeds) - 1)) {
    s1 <- seeds[[i]]
    l1 <- length(s1)
    for (j in (i + 1):length(seeds)) {
      s2 <- seeds[[j]]
      l2 <- length(s2)
      if (l1 < l2) {
        if (length(intersect(s1, s2)) == l1) {
          remove <- c(remove, i)
          break
        }
      } else{
        if (length(intersect(s2, s1)) == l2) {
          remove <- c(remove, j)
        }
      }
    }
  }
  remove <- unique(remove)
  if(length(remove) == 0){
    return(seeds)
  }else{
    return(seeds[-remove])
  }
}

seedFinding <- function(distances, simThreshold, memThreshold){
  # simthreshold: what is considered 'close' relationship
  # memthreshold: how many members of a possible seed need a close relationship for the seed to be considered
  seeds <- list()
  
  # Build matrix with xij = 1 indicating i and j are close (sim(i, j) <= simThreshold)
  reach <- apply(distances, 1, function(x) as.numeric(x <= simThreshold))
  for(i in 1:nrow(distances)){
    if(sum(reach[i,], na.rm = TRUE) >= 2){
      members <- which(reach[i, ] == 1)
      includethreshold <- (length(members)^2 - length(members)) * memThreshold
      reach_red <- reach[members, members]
      in_reach <- sum(reach_red)
      if(in_reach >= includethreshold){
        members <- c(members, i)
        seeds <- c(list(sort(unique(members))), seeds)
      }
    }
  }
  seeds <- checkInclusion(unique(seeds))
  return(seeds)
}


clustering <- function(seeds, threshold){
  if(length(seeds) <= 1){
    return(seeds)
  }
  mergeable <- rep(TRUE, length(seeds))
  while(any(mergeable)){
    index <- which(mergeable)[1]
    s1 <- seeds[[index]]
    l <- length(seeds)
    for(j in 1:length(seeds)){
      s2 <- seeds[[j]]
      int <- intersect(s1, s2)
      union <- union(s1, s2)
       if(length(int) >= (threshold * length(union))){
        remove <- list(s1, s2)
        seeds <- seeds[!(seeds %in% remove)]
        seeds <- c(list(union), seeds)
        mergeable <- mergeable[-c(index, j)]
        mergeable <- c(TRUE, mergeable)
        break
       }
    }
    if(l == length(seeds)){
      mergeable[[index]] <- FALSE
    }
    
  }
  return(seeds)
}
mmSeeds <- seedFinding(mmDist, 0.3, 0.5)
mmCluster <- clustering(mmSeeds, 0.5)
kappaSeeds <- seedFinding(kappaDist, 0.3, 0.5)
kappaCluster <- clustering(kappaSeeds, 0.5)

4 Fourth Step: Get the Cluster to a graph

getAdjacencyMatrixCluster <- function(cluster, l){
  adj <- Matrix::Matrix(0, l, l)
  
  for(i in cluster){
    nodes <- unlist(i)
    li <- length(nodes)
    for(j in 1:(li - 1)){
      a <- nodes[j]
      for(k in (j + 1):li){
        b <- nodes[k]
        adj[a, b] <- adj[b, a] <- 1
      }
    }
  }
  
  return(adj)
}
mmClusterAdj <- getAdjacencyMatrixCluster(mmCluster, nrow(mmDist))

mmClustGraph <- buildGraph(mmClusterAdj)
plot(mmClustGraph, layout=layout.random, main="random")

kappaClusterAdj <- getAdjacencyMatrixCluster(kappaCluster, nrow(kappaDist))

kappaClustGraph <- buildGraph(kappaClusterAdj)
plot(kappaClustGraph, layout=layout.random, main="random")

Session information

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur/Monterey 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] STRINGdb_2.8.4      BiocFileCache_2.4.0 dbplyr_2.1.1       
##  [4] taxize_0.9.100      pheatmap_1.0.12     dplyr_1.0.9        
##  [7] Matrix_1.4-1        DT_0.23             igraph_1.3.1       
## [10] knitr_1.39         
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-157        bitops_1.0-7        bold_1.2.0         
##  [4] bit64_4.0.5         filelock_1.0.2      RColorBrewer_1.1-3 
##  [7] httr_1.4.3          tools_4.2.0         bslib_0.3.1        
## [10] utf8_1.2.2          R6_2.5.1            KernSmooth_2.23-20 
## [13] DBI_1.1.2           colorspace_2.0-3    tidyselect_1.1.2   
## [16] bit_4.0.4           curl_4.3.2          compiler_4.2.0     
## [19] chron_2.3-57        cli_3.3.0           xml2_1.3.3         
## [22] sass_0.4.1          caTools_1.18.2      scales_1.2.0       
## [25] rappdirs_0.3.3      stringr_1.4.0       digest_0.6.29      
## [28] rmarkdown_2.14      pkgconfig_2.0.3     htmltools_0.5.2    
## [31] plotrix_3.8-2       fastmap_1.1.0       highr_0.9          
## [34] htmlwidgets_1.5.4   rlang_1.0.2         rstudioapi_0.13    
## [37] httpcode_0.3.0      RSQLite_2.2.14      jquerylib_0.1.4    
## [40] generics_0.1.2      zoo_1.8-10          jsonlite_1.8.0     
## [43] crosstalk_1.2.0     gtools_3.9.2.1      RCurl_1.98-1.6     
## [46] magrittr_2.0.3      Rcpp_1.0.8.3        munsell_0.5.0      
## [49] fansi_1.0.3         ape_5.6-2           proto_1.0.0        
## [52] lifecycle_1.0.1     sqldf_0.4-11        stringi_1.7.6      
## [55] yaml_2.3.5          gplots_3.1.3        plyr_1.8.7         
## [58] grid_4.2.0          blob_1.2.3          parallel_4.2.0     
## [61] crayon_1.5.1        lattice_0.20-45     conditionz_0.1.0   
## [64] hash_2.2.6.2        pillar_1.7.0        uuid_1.1-0         
## [67] codetools_0.2-18    crul_1.2.0          glue_1.6.2         
## [70] evaluate_0.15       data.table_1.14.2   BiocManager_1.30.18
## [73] vctrs_0.4.1         png_0.1-7           foreach_1.5.2      
## [76] gtable_0.3.0        purrr_0.3.4         reshape_0.8.9      
## [79] assertthat_0.2.1    gsubfn_0.7          cachem_1.0.6       
## [82] xfun_0.31           tibble_3.1.7        iterators_1.0.14   
## [85] memoise_2.0.1       ellipsis_0.3.2      BiocStyle_2.24.0